TFLearn Fragment Detection

Catherine has prepared data files in which sentences have been turned into fragments. As input I will use 60,000 fragments and 60,000 sentences, with the fragments derived from those same sentences. (In the future, the fragments will not be derived from the input sentences.) Each example is labeled 1 for a complete sentence and 0 for a fragment.
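
Each line of a `detailedRemoval.txt` file is assumed to be a ` ||| `-delimited record where field 0 is the original sentence and field 2 is the fragment built from it; the loading code below touches only those two fields, and what field 1 holds is not assumed here. A sketch of one such record, with invented content:

In [ ]:
# Hypothetical record, for illustration only -- the real files supply these lines.
line = "She sang and he danced. ||| <removal details> ||| she sang"
fields = line.split(" ||| ")
print(fields[0])  # original sentence -> label 1
print(fields[2])  # fragment          -> label 0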

Install Dependencies


In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import ngrams, trigrams
import csv

Load Datafiles


In [ ]:
texts = []
labels = []

with open("../.removingPOS/updatedSentences/conjunctionSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
with open("../.removingPOS/updatedSentences/nounSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)

with open("../.removingPOS/updatedSentences/nounverbSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
with open("../.removingPOS/updatedSentences/verbSentences/detailedRemoval.txt","r") as f:
    for line in f:
        asArray = line.split(" ||| ")
        fragment = asArray[2].strip()
        fragment = re.sub("\ \.", ".", fragment)
        fragment = re.sub("\,\.", ".", fragment)
        texts.append(fragment.capitalize())
        labels.append(0)
        texts.append(asArray[0].strip())
        labels.append(1)
        
print(texts[-10:])
Shuffle the data

In [ ]:
import random

combined = list(zip(texts,labels))
random.shuffle(combined)

texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])
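
If the train/test membership needs to be stable across runs (say, to compare models), seed the RNG before shuffling; a seeded variant of the cell above:

In [ ]:
# Optional, illustrative: seeding makes the shuffle (and so the later split)
# reproducible. The seed value 42 is arbitrary.
random.seed(42)
combined = list(zip(texts, labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)
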
Get parts of speech for text string

In [ ]:
def textStringToPOSArray(text):
    doc = nlp(text)
    tags = []
    for word in doc:
        tags.append(word.pos_)
    return tags

textStringToPOSArray(texts[3])
Get POS trigrams for a text string

In [ ]:
def find_ngrams(input_list, n):
    # Pure-Python n-gram generator (kept for reference; nltk's trigrams is used below).
    return zip(*[input_list[i:] for i in range(n)])

def getPOSTrigramsForTextString(text):
    tags = textStringToPOSArray(text)
    tgrams = list(trigrams(tags))
    return tgrams

print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])
Turn Trigrams into Dict keys

In [ ]:
def trigramsToDictKeys(trigrams):
    keys = []
    for trigram in trigrams:
        keys.append('>'.join(trigram))
    return keys

print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))

In [ ]:
from collections import Counter

c = Counter()

for textString in texts:
    c.update(trigramsToDictKeys(getPOSTrigramsForTextString(textString)))

total_counts = c

print("Total words in data set: ", len(total_counts))

In [ ]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:1200]
print(vocab[:60])

In [ ]:
print(vocab[-1], ': ', total_counts[vocab[-1]])

Take the trigrams and index them


In [ ]:
word2idx = {n: i for i, n in enumerate(vocab)}  # map each POS-trigram key to its vocab index
print(word2idx)

In [ ]:
def textToTrigrams(text): 
    return trigramsToDictKeys(getPOSTrigramsForTextString(text))

def text_to_vector(text):
    wordVector = np.zeros(len(vocab))
    for word in textToTrigrams(text):
        index = word2idx.get(word)
        if index is not None:
            wordVector[index] += 1
    return wordVector
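
As a quick, illustrative sanity check, the vector's total count should equal the number of the text's POS trigrams that landed inside the 1,200-trigram vocabulary:

In [ ]:
# Sanity check (illustrative): counted trigrams == trigrams the vocab knows about.
vec = text_to_vector(texts[0])
known = sum(1 for t in textToTrigrams(texts[0]) if t in word2idx)
print(int(vec.sum()), known)  # the two numbers should match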

In [ ]:
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]

In [ ]:
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for ii, text in enumerate(texts):
    word_vectors[ii] = text_to_vector(text)

In [ ]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

Chunking the data for TF


In [ ]:
records = len(labels)
train_fraction = 0.9  # first 90% of the shuffled data trains; the final 10% is held out

split = int(records * train_fraction)
print(split)
trainX, trainY = word_vectors[:split], to_categorical(labels[:split], 2)
testX, testY = word_vectors[split:], to_categorical(labels[split:], 2)

In [ ]:
trainX[-1], trainY[-1]

In [ ]:
len(trainY), len(testY), len(trainY) + len(testY)

Setting up TF


In [ ]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    net = tflearn.input_data([None, len(vocab)])                      # Input
    net = tflearn.fully_connected(net, 200, activation='ReLU')        # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU')         # Hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')       # Output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    return model

In [ ]:
len(vocab)

Initialize


In [ ]:
model = build_model()

Training


In [ ]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)

In [ ]:
# Testing: column 0 of the softmax output is the probability of class 0
# (fragment), so threshold it and compare with column 0 of the one-hot labels.
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Playground


In [ ]:
def test_sentence(sentence):
    # Column 1 of the softmax output is the probability that the input
    # is a complete sentence (label 1).
    positive_prob = model.predict([text_to_vector(sentence)])[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker.")

In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker, Peter lost the debate.")

In [ ]:
test_sentence("Working far into the night in an effort to salvage her little boat.")

In [ ]:
test_sentence("She was working far into the night in an effort to salvage her little boat.")

In [ ]:
test_sentence("The man eating pizza.")

In [ ]:
test_sentence("The man eating pizza is overwieght.")

In [ ]:
test_sentence("While we were swimming at the lake.")

In [ ]:
test_sentence("While we were swimming at the lake, we saw a fish.")

In [ ]:
test_sentence("Keep going.")

In [ ]:
test_sentence("A time of wonder and amazement")

In [ ]:
test_sentence("That was a time of wonder and amazement")

In [ ]:
test_sentence("Since she never saw that movie.")

In [ ]:
test_sentence("We should invite her, since she never saw that movie.")

In [ ]:
test_sentence("Affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Quill is affecting the lives of many students in New York City.")

In [ ]:
test_sentence("Standing on the edge of the cliff looking down.")

In [ ]:
test_sentence("I'm standing on the edge of the cliff and looking down.")

In [ ]:
test_sentence("The team looked forward to victory.")

In [ ]:
model.save("./model.tfl")
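
To reuse the model later, TFLearn needs the identical graph rebuilt before the weights are restored; a minimal sketch:

In [ ]:
# Sketch: restore the saved weights into a freshly built, identical graph.
model = build_model()
model.load("./model.tfl")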

Save the vocab


In [ ]:
# Use a context manager so the file is flushed and closed; newline="" avoids blank rows on Windows.
with open("./vocabindex.csv", "w", newline="") as f:
    w = csv.writer(f)
    for key, val in word2idx.items():
        w.writerow([key, val])
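
The mirror operation, reading `vocabindex.csv` back into a dict (e.g., in a serving process), is a few lines; a sketch (the name `word2idxLoaded` is just for illustration):

In [ ]:
# Sketch: rebuild the trigram-to-index map from the saved CSV.
word2idxLoaded = {}
with open("./vocabindex.csv") as f:
    for row in csv.reader(f):
        if row:  # skip any blank rows
            word2idxLoaded[row[0]] = int(row[1])
print(len(word2idxLoaded))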

In [ ]:
vocab
